{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 08 ``Numpy`` 中的聚合操作"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## sum"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "54.406852984527816"
     },
     "metadata": {},
     "execution_count": 3
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "L = np.random.random(100)\n",
    "sum(L)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "54.40685298452781"
     },
     "metadata": {},
     "execution_count": 4
    }
   ],
   "source": [
    "np.sum(L)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "78 ms ± 212 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n819 µs ± 11.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
    }
   ],
   "source": [
    "# np中的sum的效率高！\n",
    "big_array = np.random.rand(1000000)\n",
    "%timeit sum(big_array)\n",
    "%timeit np.sum(big_array)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## min, max"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "9.501370793474351e-07"
     },
     "metadata": {},
     "execution_count": 6
    }
   ],
   "source": [
    "np.min(big_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0.9999991543064245"
     },
     "metadata": {},
     "execution_count": 7
    }
   ],
   "source": [
    "np.max(big_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "9.501370793474351e-07"
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "source": [
    "# big_array中不太全，尽量使用np中的方法\n",
    "big_array.min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0.9999991543064245"
     },
     "metadata": {},
     "execution_count": 9
    }
   ],
   "source": [
    "big_array.max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "500133.52839085093"
     },
     "metadata": {},
     "execution_count": 10
    }
   ],
   "source": [
    "big_array.sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 多维度聚合"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "array([[ 0,  1,  2,  3],\n       [ 4,  5,  6,  7],\n       [ 8,  9, 10, 11],\n       [12, 13, 14, 15]])"
     },
     "metadata": {},
     "execution_count": 11
    }
   ],
   "source": [
    "X = np.arange(16).reshape(4,-1)\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "120"
     },
     "metadata": {},
     "execution_count": 12
    }
   ],
   "source": [
    "# 不管维度，默认所有元素的和\n",
    "np.sum(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "array([24, 28, 32, 36])"
     },
     "metadata": {},
     "execution_count": 13
    }
   ],
   "source": [
    "# axis=0是每一列的和。axis=0是指沿着行这个方向进行运算\n",
    "# axis传入的可以理解为被压缩掉的维度。0就是压缩掉行，只剩下列\n",
    "np.sum(X, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "array([ 6, 22, 38, 54])"
     },
     "metadata": {},
     "execution_count": 14
    }
   ],
   "source": [
    "# axis=1是每一行的和。axis=1是指沿着列这个方向进行运算\n",
    "np.sum(X, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*注意：axis描述的是将要被压缩的维度。*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 其他聚合操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0"
     },
     "metadata": {},
     "execution_count": 15
    }
   ],
   "source": [
    "np.prod(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "2004189184"
     },
     "metadata": {},
     "execution_count": 16
    }
   ],
   "source": [
    "# 所有元素的乘积\n",
    "np.prod(X + 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "7.5"
     },
     "metadata": {},
     "execution_count": 17
    }
   ],
   "source": [
    "# 均值\n",
    "np.mean(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "7.5"
     },
     "metadata": {},
     "execution_count": 18
    }
   ],
   "source": [
    "# 中位数\n",
    "np.median(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "3.2"
     },
     "metadata": {},
     "execution_count": 19
    }
   ],
   "source": [
    "# 异常值对平均数的影响过大\n",
    "v = np.array([1, 1, 2, 2, 10])\n",
    "np.mean(v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "2.0"
     },
     "metadata": {},
     "execution_count": 20
    }
   ],
   "source": [
    "np.median(v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0.49975945101334784"
     },
     "metadata": {},
     "execution_count": 21
    }
   ],
   "source": [
    "# %50的百分位\n",
    "np.percentile(big_array, q=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0.49975945101334784"
     },
     "metadata": {},
     "execution_count": 22
    }
   ],
   "source": [
    "np.median(big_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0.9999991543064245"
     },
     "metadata": {},
     "execution_count": 23
    }
   ],
   "source": [
    "# 最大值\n",
    "np.percentile(big_array, q=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0.9999991543064245"
     },
     "metadata": {},
     "execution_count": 24
    }
   ],
   "source": [
    "np.max(big_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "9.501370793474351e-07\n0.250266373380963\n0.49975945101334784\n0.7500123443234672\n0.9999991543064245\n"
    }
   ],
   "source": [
    "# 一边看这几个百分点就能看出分布\n",
    "for percent in [0, 25, 50, 75, 100]:\n",
    "    print(np.percentile(big_array, q=percent))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0.08331921851401417"
     },
     "metadata": {},
     "execution_count": 26
    }
   ],
   "source": [
    "# 方差\n",
    "np.var(big_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0.288650685975305"
     },
     "metadata": {},
     "execution_count": 27
    }
   ],
   "source": [
    "# 标准差\n",
    "np.std(big_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "x = np.random.normal(0, 1, 1000000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0.0009186836023089008"
     },
     "metadata": {},
     "execution_count": 29
    }
   ],
   "source": [
    "np.mean(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0.9995089112318769"
     },
     "metadata": {},
     "execution_count": 30
    }
   ],
   "source": [
    "np.std(x)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}